In [ ]:
def preprocess_cornell():
    """Flatten the Cornell movie review data into one shuffled, labelled file."""
    import os
    import re
    import random
    neg_dir = 'txt_sentoken/neg'
    pos_dir = 'txt_sentoken/pos'
    neg_files = [os.path.join(neg_dir, f) for f in os.listdir(neg_dir) if '.txt' in f]
    pos_files = [os.path.join(pos_dir, f) for f in os.listdir(pos_dir) if '.txt' in f]
    neg_examples = []
    for f_path in neg_files:
        with open(f_path) as f:
            text = f.read()
        clean = re.sub(r'\W+', ' ', text)  # collapse all non-word characters to spaces
        neg_examples.append(clean)
    pos_examples = []
    for f_path in pos_files:
        with open(f_path) as f:
            text = f.read()
        clean = re.sub(r'\W+', ' ', text)
        pos_examples.append(clean)
    # one review per line, prefixed with its label and a tab
    lines = ["pos\t" + l + "\n" for l in pos_examples] + ["neg\t" + l + "\n" for l in neg_examples]
    random.shuffle(lines)
    with open('movie_reviews.txt', 'w') as f:
        f.writelines(lines)
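For reference, a hypothetical invocation of the helper above, assuming the Cornell polarity data has been unpacked so that txt_sentoken/neg and txt_sentoken/pos sit next to the notebook:
In [ ]:
# hypothetical usage: requires txt_sentoken/neg and txt_sentoken/pos locally
preprocess_cornell()  # writes the shuffled, labelled reviews to movie_reviews.txt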
We'll begin by loading the processed file:
In [2]:
import urllib
class Example:
    def __init__(self, text, label):
        self.text = text
        self.label = label
        self.features = None

    def __repr__(self):
        return self.label + "\t" + self.text
link = "https://dl.dropboxusercontent.com/u/9015381/notebook/movie_reviews.txt"
f = urllib.urlopen(link)
examples = [ Example(e.split("\t")[1], e.split("\t")[0]) for e in f.read().split("\n") if e ]
Now we extract bag-of-words features:
In [5]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1, binary=True) #we only use binary features
corpus = [e.text for e in examples]
X = vectorizer.fit_transform(corpus)
from sklearn import preprocessing
label_vector = [e.label for e in examples]
le = preprocessing.LabelEncoder()
le.fit(label_vector)
print list(le.classes_)
y = le.transform(label_vector)
print y
Now we can train a simple logistic regression model and evaluate it with 10-fold cross-validation:
In [9]:
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
kfolds = cross_validation.KFold(len(y), 10)
print kfolds
i = 0
for train_idx, test_idx in kfolds:
    print 'fold', i,
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    model = LogisticRegression()
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    print 'score:', score
    i += 1
Let's look at how the features are weighted:
In [10]:
model = LogisticRegression()
model.fit(X, y)
weights = model.coef_.tolist()[0] #tolist() returns a list of lists (i.e. a 1xn array)
# note: LabelEncoder orders classes alphabetically, so 'neg' maps to 0 and 'pos' to 1
# (see the quick check after this cell). Negative weights therefore push the estimate
# towards 'neg' (0), so sorting the weights in ascending order surfaces the most
# negative words; the reverse holds for positive sentiment.
most_neg_idx = sorted(range(len(weights)), key=lambda k:weights[k])[:5]
most_pos_idx = sorted(range(len(weights)), key=lambda k:weights[k], reverse=True)[:5]
features = vectorizer.get_feature_names()
print 'most positive words:'
for idx in most_pos_idx:
    print features[idx], ':', weights[idx]
print
print 'most negative words:'
for idx in most_neg_idx:
    print features[idx], ':', weights[idx]
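A quick check (not in the original notebook) confirms which integer each label maps to:
In [ ]:
# LabelEncoder orders classes alphabetically: 'neg' -> 0, 'pos' -> 1
print list(le.classes_)
print le.transform(['neg', 'pos'])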
Suppose now we get some new data and would like to predict:
In [11]:
new_data = """
With funny work from Chris Pratt as Theodore's boss and fine performances from Amy
Adams as his mousey friend and Phoenix himself, this had the capacity to become
darker but Jonze isn't interested in being nasty.
"""
# clean it: replace newlines with spaces so words on adjacent lines are not glued together
new_data = new_data.replace("\n", ' ')
# turn it into features
X_new = vectorizer.transform([new_data])
# predict it
y_estimate = model.predict_proba(X_new)
neg_prob = y_estimate.tolist()[0][0]
pos_prob = y_estimate.tolist()[0][1]
print 'positive:', pos_prob
print 'negative:', neg_prob
Let's try something more confusing:
In [12]:
new_data = """
It wasn't terrible. I thought the performance would be awful but it turned out to be not bad.
"""
# clean it: replace newlines with spaces so words on adjacent lines are not glued together
new_data = new_data.replace("\n", ' ')
# turn it into features
X_new = vectorizer.transform([new_data])
# predict it
y_estimate = model.predict_proba(X_new)
neg_prob = y_estimate.tolist()[0][0]
pos_prob = y_estimate.tolist()[0][1]
print 'positive:', pos_prob
print 'negative:', neg_prob
Since the feature representation is a unigram bag of words, we disregard any sequential relationships between the words and rely only on the occurrences of words from the training vocabulary. In this example, negative-sentiment words occur but are negated. Thanks to the negation, a human reader can "reverse" the connotations of the individual words, whereas our model becomes very confused.
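A common partial remedy, not used in this notebook, is to add bigram features so that short negation patterns such as "not bad" can receive their own weights (provided they occur in the training data). A minimal sketch, reusing the corpus, y, and new_data defined above:
In [ ]:
# sketch only: rebuild the vectorizer with unigrams + bigrams and retrain
bigram_vectorizer = CountVectorizer(min_df=1, binary=True, ngram_range=(1, 2))
X_bigram = bigram_vectorizer.fit_transform(corpus)
bigram_model = LogisticRegression()
bigram_model.fit(X_bigram, y)
# score the same "confusing" review with the bigram-aware model
X_new_bigram = bigram_vectorizer.transform([new_data])
print bigram_model.predict_proba(X_new_bigram)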
In [14]:
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
import numpy as np
import re
supported_labels = ['pos', 'neg']
le = preprocessing.LabelEncoder()
le.fit(supported_labels)
Suppose we begin with a couple of seed examples:
In [15]:
new_examples = ['I like pie', 'i dont like chicken']
new_labels = ['pos', 'neg']
new_vectorizer = CountVectorizer(min_df=1, binary=True) #we only use binary features
new_X = new_vectorizer.fit_transform(new_examples)
print 'seed X:', new_X
new_y = le.transform(new_labels)
print 'seed y:', new_y
model = SGDClassifier(loss='log') #we want to use a logistic regression loss function
# 'classes' must list every label the model will ever see, not just this batch's labels
model.partial_fit(new_X, new_y, classes=le.transform(le.classes_))
print 'seed coefficients:', model.coef_
vocabulary = dict([(name, idx) for idx, name in enumerate(new_vectorizer.get_feature_names())])
print 'seed vocabulary:', vocabulary
We have our basic model. Now suppose we get a new example, but this example contains unknown words:
In [16]:
new_examples = ['Johnny kind of likes chicken']
new_labels = ['pos']
new_vectorizer = CountVectorizer(min_df=1, binary=True)
new_vectorizer.fit(new_examples)
new_words = new_vectorizer.get_feature_names()
print 'new words:', new_words
unknown_words = list(set(new_words) - set(vocabulary.keys()))
print 'unknown words:', unknown_words
# enlarge our vocabulary and our weight vector
for w in unknown_words:
    vocabulary[w] = len(vocabulary)
print 'new vocabulary:', vocabulary
new_weights = np.zeros(shape=(model.coef_.shape[0], len(unknown_words)))
model.coef_ = np.concatenate((model.coef_, new_weights), axis=1)
vectorizer = CountVectorizer(min_df=0, binary=True, vocabulary = vocabulary)
new_X = vectorizer.fit_transform(new_examples)
new_y = le.transform(new_labels)
model.partial_fit(new_X, new_y)
print model.coef_
We can see that the weight vector was updated properly (i.e. incrementally) via stochastic gradient descent, without refitting the model from scratch.
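The same pattern can be repeated for further batches: vectorize with the current vocabulary, grow the vocabulary and the coefficient matrix whenever new words appear, and call partial_fit again. A minimal sketch (not part of the original notebook) with a hypothetical follow-up example whose words are all already known:
In [ ]:
# sketch only: one more incremental update plus a prediction with the enlarged model
more_examples = ['johnny likes pie']          # hypothetical new labelled example
more_labels = ['pos']
more_X = vectorizer.transform(more_examples)  # every token here is already in the vocabulary
more_y = le.transform(more_labels)
model.partial_fit(more_X, more_y)             # incremental update, no refit from scratch
print 'updated coefficients:', model.coef_
print 'probabilities for a seed sentence:', model.predict_proba(vectorizer.transform(['i dont like chicken']))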